import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
path= "C:\\Users\\gaurav\\Desktop\\Sales_Data"
files= [ file for file in os.listdir(path)]
all_months_data = pd.DataFrame()
for file in files:
df=pd.read_csv(path+"/"+file)
all_months_data = pd.concat([all_months_data, df])
all_months_data.to_csv("all_data_copy.csv", index=False)
all_months_data.head()
| Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | |
|---|---|---|---|---|---|---|
| 0 | 176558 | USB-C Charging Cable | 2 | 11.95 | 04/19/19 08:46 | 917 1st St, Dallas, TX 75001 |
| 1 | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 176559 | Bose SoundSport Headphones | 1 | 99.99 | 04/07/19 22:30 | 682 Chestnut St, Boston, MA 02215 |
| 3 | 176560 | Google Phone | 1 | 600 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 |
| 4 | 176560 | Wired Headphones | 1 | 11.99 | 04/12/19 14:38 | 669 Spruce St, Los Angeles, CA 90001 |
all_months_data.isnull().sum() #Finding nan values
Order ID 545 Product 545 Quantity Ordered 545 Price Each 545 Order Date 545 Purchase Address 545 dtype: int64
all_months_data.dropna(axis=0,inplace=True) # removing nan values
all_months_data.reset_index(inplace=True)
all_months_data.drop(['index'],axis=1,inplace=True)
all_months_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 186305 entries, 0 to 186304 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Order ID 186305 non-null object 1 Product 186305 non-null object 2 Quantity Ordered 186305 non-null object 3 Price Each 186305 non-null object 4 Order Date 186305 non-null object 5 Purchase Address 186305 non-null object dtypes: object(6) memory usage: 8.5+ MB
all_months_data = all_months_data[all_months_data['Order Date'].str[0:2]!='Or']
all_months_data['Order Date'] = pd.to_datetime(all_months_data['Order Date'])
all_months_data['Price Each'] = pd.to_numeric(all_months_data['Price Each'])
all_months_data['Year']=all_months_data['Order Date'].dt.year
all_months_data['Month']=all_months_data['Order Date'].dt.month
all_months_data['Quantity Ordered']= pd.to_numeric(all_months_data['Quantity Ordered'])
all_months_data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 185950 entries, 0 to 186304 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Order ID 185950 non-null object 1 Product 185950 non-null object 2 Quantity Ordered 185950 non-null int64 3 Price Each 185950 non-null float64 4 Order Date 185950 non-null datetime64[ns] 5 Purchase Address 185950 non-null object 6 Year 185950 non-null int64 7 Month 185950 non-null int64 dtypes: datetime64[ns](1), float64(1), int64(3), object(3) memory usage: 12.8+ MB
all_months_data['Sales'] = all_months_data['Quantity Ordered']*all_months_data['Price Each']
all_months_data.head()
| Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Year | Month | Sales | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 176558 | USB-C Charging Cable | 2 | 11.95 | 2019-04-19 08:46:00 | 917 1st St, Dallas, TX 75001 | 2019 | 4 | 23.90 |
| 1 | 176559 | Bose SoundSport Headphones | 1 | 99.99 | 2019-04-07 22:30:00 | 682 Chestnut St, Boston, MA 02215 | 2019 | 4 | 99.99 |
| 2 | 176560 | Google Phone | 1 | 600.00 | 2019-04-12 14:38:00 | 669 Spruce St, Los Angeles, CA 90001 | 2019 | 4 | 600.00 |
| 3 | 176560 | Wired Headphones | 1 | 11.99 | 2019-04-12 14:38:00 | 669 Spruce St, Los Angeles, CA 90001 | 2019 | 4 | 11.99 |
| 4 | 176561 | Wired Headphones | 1 | 11.99 | 2019-04-30 09:27:00 | 333 8th St, Los Angeles, CA 90001 | 2019 | 4 | 11.99 |
all_months_data['Purchase Address']= pd.Series(all_months_data['Purchase Address'],dtype="string")
all_months_data.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 185950 entries, 0 to 186304 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Order ID 185950 non-null object 1 Product 185950 non-null object 2 Quantity Ordered 185950 non-null int64 3 Price Each 185950 non-null float64 4 Order Date 185950 non-null datetime64[ns] 5 Purchase Address 185950 non-null string 6 Year 185950 non-null int64 7 Month 185950 non-null int64 8 Sales 185950 non-null float64 dtypes: datetime64[ns](1), float64(2), int64(3), object(2), string(1) memory usage: 14.2+ MB
all_months_data['City']=all_months_data['Purchase Address'].str.split(',',expand=True)[1]
result=all_months_data.groupby('City')['Sales'].sum().reset_index().sort_values('Sales')
import plotly.express as px
fig = px.bar(result, x='City', y='Sales',
hover_data=['Sales'], color='Sales',
labels={'pop':'Sales'}, height=400)
fig.show()
month_sales=all_months_data.groupby('Month')['Sales'].sum().to_frame('Sales').reset_index()
month_sales
| Month | Sales | |
|---|---|---|
| 0 | 1 | 1822256.73 |
| 1 | 2 | 2202022.42 |
| 2 | 3 | 2807100.38 |
| 3 | 4 | 3390670.24 |
| 4 | 5 | 3152606.75 |
| 5 | 6 | 2577802.26 |
| 6 | 7 | 2647775.76 |
| 7 | 8 | 2244467.88 |
| 8 | 9 | 2097560.13 |
| 9 | 10 | 3736726.88 |
| 10 | 11 | 3199603.20 |
| 11 | 12 | 4613443.34 |
x=month_sales['Month']
y=month_sales['Sales']
fig = px.line(month_sales,'Month','Sales')
fig.show()
all_months_data['Hour']=all_months_data['Order Date'].dt.hour
hourly_sales=all_months_data.groupby('Hour')['Sales'].count().to_frame('Sales by hour').reset_index()
hourly_sales
| Hour | Sales by hour | |
|---|---|---|
| 0 | 0 | 3910 |
| 1 | 1 | 2350 |
| 2 | 2 | 1243 |
| 3 | 3 | 831 |
| 4 | 4 | 854 |
| 5 | 5 | 1321 |
| 6 | 6 | 2482 |
| 7 | 7 | 4011 |
| 8 | 8 | 6256 |
| 9 | 9 | 8748 |
| 10 | 10 | 10944 |
| 11 | 11 | 12411 |
| 12 | 12 | 12587 |
| 13 | 13 | 12129 |
| 14 | 14 | 10984 |
| 15 | 15 | 10175 |
| 16 | 16 | 10384 |
| 17 | 17 | 10899 |
| 18 | 18 | 12280 |
| 19 | 19 | 12905 |
| 20 | 20 | 12228 |
| 21 | 21 | 10921 |
| 22 | 22 | 8822 |
| 23 | 23 | 6275 |
x=hourly_sales['Hour']
y=hourly_sales['Sales by hour']
fig = px.line(hourly_sales, x='Hour', y="Sales by hour",markers=True)
fig.show()
product_quantity=all_months_data.groupby('Product')['Quantity Ordered'].sum().reset_index().sort_values(by='Quantity Ordered',ascending=False)
import plotly.express as px
fig = px.bar(product_quantity, x='Product', y='Quantity Ordered',
hover_data=['Product'], color='Product',title="Quantitywise product",
labels={'pop':'Quantity Ordered'}, height=500)
fig.show()
all_months_data.head()
| Order ID | Product | Quantity Ordered | Price Each | Order Date | Purchase Address | Year | Month | Sales | City | Hour | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 176558 | USB-C Charging Cable | 2 | 11.95 | 2019-04-19 08:46:00 | 917 1st St, Dallas, TX 75001 | 2019 | 4 | 23.90 | Dallas | 8 |
| 1 | 176559 | Bose SoundSport Headphones | 1 | 99.99 | 2019-04-07 22:30:00 | 682 Chestnut St, Boston, MA 02215 | 2019 | 4 | 99.99 | Boston | 22 |
| 2 | 176560 | Google Phone | 1 | 600.00 | 2019-04-12 14:38:00 | 669 Spruce St, Los Angeles, CA 90001 | 2019 | 4 | 600.00 | Los Angeles | 14 |
| 3 | 176560 | Wired Headphones | 1 | 11.99 | 2019-04-12 14:38:00 | 669 Spruce St, Los Angeles, CA 90001 | 2019 | 4 | 11.99 | Los Angeles | 14 |
| 4 | 176561 | Wired Headphones | 1 | 11.99 | 2019-04-30 09:27:00 | 333 8th St, Los Angeles, CA 90001 | 2019 | 4 | 11.99 | Los Angeles | 9 |
most_selling=all_months_data.groupby('Product')['Product'].count().to_frame('No_of_sales').reset_index()
import plotly.express as px
fig = px.bar(most_selling, x='Product', y='No_of_sales',
hover_data=['Product'], color='Product',
labels={'pop':'No_of_sales'}, height=500)
fig.show()
1.Most sales are in the city of san fransico and lowest sale in city Austin where we have to focus on sales.
2.Peak shopping hours of customers is between 12 to 20 which shows it is the most perferable time of customers and all extra operational work can be done before 12.
3.December is best month of sales and sales where down between july to september month.